pbmc.data <- Read10X(data.dir = "/home/nastasista/Desktop/RNAseq/sc_RNA/t")
pbmc <- CreateSeuratObject(counts = pbmc.data, project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat
## 19420 features across 7903 samples within 1 assay
## Active assay: RNA (19420 features, 0 variable features)
## 2 layers present: counts, data
rm(pbmc.data)
# QC
meta <- pbmc@meta.data
dim(meta)
## [1] 7903 3
head(meta)
Объект Seurat(PBMC) содержит 19420 генов и 7903 клетки (features)
summary(meta$nCount_RNA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 500 2617 3206 3715 4092 29348
summary(meta$nFeature_RNA)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 200 953 1131 1279 1386 5601
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
pbmc[["percent.rb"]] <- PercentageFeatureSet(pbmc, pattern = "^RP[SL]")
head(pbmc[[]])
VlnPlot(pbmc, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),ncol = 4,pt.size = 0.1) &
theme(plot.title = element_text(size=10))
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2
FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.rb")
FeatureScatter(pbmc, feature1 = "percent.rb", feature2 = "percent.mt")
Фильтрация: nFeature_RNA(кол-во генов) больше 200 и меньше 2500(если выше - вероятно это дуплеты), фильтрация по митохондриальному контенту < 15, в соответсвии с предыдущим графиком(VlnPlot)
pbmc <- subset(pbmc, subset = nFeature_RNA > 200 &
nFeature_RNA < 2500 &
percent.mt < 15)
meta <- pbmc@meta.data
dim(meta)
## [1] 7396 5
После фильтрации 7396 клеток
VlnPlot(pbmc, features = c("nFeature_RNA","nCount_RNA","percent.mt","percent.rb"),ncol = 4, pt.size = 0.1) &
theme(plot.title = element_text(size= 10))
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
top10 <- head(VariableFeatures(pbmc), 10)
top10 #топ 10 вариабельных генов
## [1] "S100A9" "LYZ" "S100A8" "IGKC" "PPBP" "IGLC3" "IGLC2" "PF4"
## [9] "HBB" "IGLC1"
Plot variable features with and without labels:
plot1 <- VariableFeaturePlot(pbmc)
LabelPoints(plot = plot1, points = top10, repel = TRUE, xnudge = 0, ynudge = 0)
## Warning: Transformation introduced infinite values in continuous x-axis
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: RPS12, RPS18, IL7R, LTB, RPL10, RPL13, MALAT1, IL32, TRBC2, TPT1
## EEF1A1, TRBC1, ARL4C, LEF1, RPLP1, MAL, BCL2, CD69, RORA, SYNE2
## FHIT, AQP3, GZMM, CD8B, NELL2, LINC02446, ARID5B, TSHZ2, AP3M2, TBC1D4
## Negative: FCN1, CST3, IFI30, CYBB, MNDA, LYZ, FGL2, VCAN, S100A9, MPEG1
## SERPINA1, NCF2, S100A8, AC020656.1, MS4A6A, CD14, CTSS, LST1, CLEC7A, CD68
## CFD, SPI1, CSTA, TNFAIP2, LGALS2, PSAP, CD302, CPVL, S100A12, GRN
## PC_ 2
## Positive: EEF1A1, MALAT1, RPL13, RPS18, RPL10, RPS12, RPLP1, TPT1, LTB, VIM
## IL7R, CD37, JUN, JUNB, HLA-DRA, AIF1, NFKBIZ, HLA-DQB1, S100A6, CD74
## MARCH1, SAMHD1, MAL, POU2F2, S100A10, CTSS, HLA-DRB5, NFKBIA, DUSP1, CYBB
## Negative: GP1BB, TUBB1, CAVIN2, GNG11, GP9, PF4, PPBP, PRKAR2B, NRGN, ACRBP
## PTCRA, CLU, F13A1, CMTM5, SPARC, RGS18, C2orf88, HIST1H2AC, TREML1, AC147651.1
## TMEM40, MMD, MPIG6B, ITGA2B, DAB2, RUFY1, MAP3K7CL, PF4V1, TSC22D1, CLEC1B
## PC_ 3
## Positive: CD79A, IGHM, MS4A1, BANK1, IGHD, IGKC, RALGPS2, BCL11A, TNFRSF13C, LINC00926
## CD22, CD79B, HLA-DQA1, HLA-DRA, PAX5, LTB, AFF3, TCL1A, COBLL1, FCRL1
## FCER2, IGLC2, VPREB3, NIBAN3, MEF2C, BLK, ADAM28, MARCH1, HLA-DQB1, TCF4
## Negative: NKG7, GNLY, CST7, GZMA, FGFBP2, GZMB, KLRD1, PRF1, GZMH, CTSW
## CCL5, EFHD2, HOPX, ADGRG1, FCGR3A, CCL4, SPON2, KLRF1, TRDC, S100A4
## GZMM, S1PR5, IL2RB, ARL4C, ID2, CX3CR1, KLRK1, TYROBP, CLIC3, KLRB1
## PC_ 4
## Positive: MS4A1, IGHM, CD79A, BANK1, IGHD, HLA-DPB1, CD79B, IGKC, NKG7, HLA-DPA1
## CD22, BCL11A, GNLY, LINC00926, TNFRSF13C, RALGPS2, GZMB, HLA-DQA1, CST7, PAX5
## FGFBP2, COBLL1, GZMA, CD74, KLRD1, HLA-DRA, PRF1, TCL1A, GZMH, ADAM28
## Negative: IL7R, TPT1, MAL, LEF1, LTB, VIM, IL6ST, RPS12, RPL13, EEF1A1
## FHIT, AIF1, RPLP1, IL32, CD4, RPS18, SAMHD1, AQP3, JUN, NFKBIZ
## JAML, TBC1D4, VCAN, TNFAIP3, S100A12, S100A8, NELL2, RPL10, NAP1L1, TOB1
## PC_ 5
## Positive: S100A12, VCAN, S100A8, CD14, AC020656.1, MS4A6A, CSF3R, CD36, AC020916.1, CXCL8
## CLEC4E, ALDH1A1, PLBD1, MGST1, NCF1, LGALS2, CREB5, IER3, CYP1B1, RNASE6
## ALDH2, FOSB, CD163, CD93, TREM1, IL1B, GAS7, CCL3, S100A9, MNDA
## Negative: CDKN1C, HES4, TCF7L2, SMIM25, CSF1R, AC104809.2, MS4A7, LYPD2, HMOX1, FCGR3A
## ZNF703, SIGLEC10, CKB, VMO1, IFITM3, PPM1N, MS4A4A, RHOC, LINC02432, MTSS1
## FMNL2, IL3RA, MEG3, LRRC25, NEURL1, BATF3, CTSL, MAFB, RRAS, AC064805.1
DimPlot(pbmc, reduction = "pca")
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: RPS12, RPS18, IL7R, LTB, RPL10
## Negative: FCN1, CST3, IFI30, CYBB, MNDA
## PC_ 2
## Positive: EEF1A1, MALAT1, RPL13, RPS18, RPL10
## Negative: GP1BB, TUBB1, CAVIN2, GNG11, GP9
## PC_ 3
## Positive: CD79A, IGHM, MS4A1, BANK1, IGHD
## Negative: NKG7, GNLY, CST7, GZMA, FGFBP2
## PC_ 4
## Positive: MS4A1, IGHM, CD79A, BANK1, IGHD
## Negative: IL7R, TPT1, MAL, LEF1, LTB
## PC_ 5
## Positive: S100A12, VCAN, S100A8, CD14, AC020656.1
## Negative: CDKN1C, HES4, TCF7L2, SMIM25, CSF1R
VizDimLoadings(pbmc, dims = 1:9, reduction = "pca") &
theme(axis.text=element_text(size=5), axis.title=element_text(size=8,face="bold"))
# Heatmaps of these genes
DimHeatmap(pbmc, dims = 1:12, nfeatures = 20, cells = 500, balanced = T)
ElbowPlot показывает какое количество принципиальных компонент объясняет вариабельность данных
ElbowPlot(pbmc)
pbmc <- FindNeighbors(pbmc, dims = 1:10) #ищем соседей по кластеру
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.4) # Resolution may vary ~0.4-1.2, depending on how well (biologically) it describes clusters
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 7396
## Number of edges: 244698
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9140
## Number of communities: 11
## Elapsed time: 1 seconds
pbmc <- RunUMAP(pbmc, dims = 1:10, verbose = F)
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
## Found more than one class "dist" in cache; using the first, from namespace 'spam'
## Also defined by 'BiocGenerics'
head(Idents(pbmc), 5)
## AAACCCAAGCGTGAGT-1 AAACCCAGTCGACGCT-1 AAACCCATCAAGAGTA-1 AAACCCATCCACGTCT-1
## 1 0 0 0
## AAACCCATCCCTCTAG-1
## 1
## Levels: 0 1 2 3 4 5 6 7 8 9 10
Кластеры и количество клеток в каждом кластере
table(pbmc@meta.data$seurat_clusters)
##
## 0 1 2 3 4 5 6 7 8 9 10
## 2347 1561 809 745 498 387 384 333 176 108 48
DimPlot(pbmc,label.size = 4,repel = T,label = T)
FeaturePlot(pbmc, features = c("S100A9", "IGLC1", "IGHM", "PF4"))
# QC check
FeaturePlot(pbmc, features = "percent.mt") & theme(plot.title = element_text(size=10))
FeaturePlot(pbmc, features = "percent.rb") & theme(plot.title = element_text(size=10))
FeaturePlot(pbmc, features = "nFeature_RNA") & theme(plot.title = element_text(size=10))
FeaturePlot(pbmc, features = "nCount_RNA") & theme(plot.title = element_text(size=10))
Гены клеточного цикла:
cc.genes.updated.2019
## $s.genes
## [1] "MCM5" "PCNA" "TYMS" "FEN1" "MCM7" "MCM4"
## [7] "RRM1" "UNG" "GINS2" "MCM6" "CDCA7" "DTL"
## [13] "PRIM1" "UHRF1" "CENPU" "HELLS" "RFC2" "POLR1B"
## [19] "NASP" "RAD51AP1" "GMNN" "WDR76" "SLBP" "CCNE2"
## [25] "UBR7" "POLD3" "MSH2" "ATAD2" "RAD51" "RRM2"
## [31] "CDC45" "CDC6" "EXO1" "TIPIN" "DSCC1" "BLM"
## [37] "CASP8AP2" "USP1" "CLSPN" "POLA1" "CHAF1B" "MRPL36"
## [43] "E2F8"
##
## $g2m.genes
## [1] "HMGB2" "CDK1" "NUSAP1" "UBE2C" "BIRC5" "TPX2" "TOP2A"
## [8] "NDC80" "CKS2" "NUF2" "CKS1B" "MKI67" "TMPO" "CENPF"
## [15] "TACC3" "PIMREG" "SMC4" "CCNB2" "CKAP2L" "CKAP2" "AURKB"
## [22] "BUB1" "KIF11" "ANP32E" "TUBB4B" "GTSE1" "KIF20B" "HJURP"
## [29] "CDCA3" "JPT1" "CDC20" "TTK" "CDC25C" "KIF2C" "RANGAP1"
## [36] "NCAPD2" "DLGAP5" "CDCA2" "CDCA8" "ECT2" "KIF23" "HMMR"
## [43] "AURKA" "PSRC1" "ANLN" "LBR" "CKAP5" "CENPE" "CTCF"
## [50] "NEK2" "G2E3" "GAS2L3" "CBX5" "CENPA"
s.genes <- cc.genes.updated.2019$s.genes
g2m.genes <- cc.genes.updated.2019$g2m.genes
pbmc <- CellCycleScoring(pbmc, s.features = s.genes, g2m.features = g2m.genes)
## Warning: The following features are not present in the object: E2F8, not
## searching for symbol synonyms
## Warning: The following features are not present in the object: PIMREG, CDC25C,
## NEK2, not searching for symbol synonyms
table(pbmc[[]]$Phase)
##
## G1 G2M S
## 2856 1867 2673
DimPlot(pbmc, group.by = "Phase")
DimPlot(pbmc, split.by = "Phase")
FeaturePlot(pbmc,features = c("S.Score","G2M.Score"),label.size = 4,repel = T,label = T) &
theme(plot.title = element_text(size=10))
VlnPlot(pbmc,features = c("S.Score","G2M.Score")) &
theme(plot.title = element_text(size=10))
cluster2.markers <- FindMarkers(pbmc, ident.1 = 2, min.pct = 0.25)
head(cluster2.markers, n = 5)
Второй кластер вероятно принадлежит T-лимфоцитам
Топ 2 гена-маркера для каждого из кластеров:
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
pbmc.markers %>%
group_by(cluster) %>%
slice_max(n = 2, order_by = avg_log2FC)
Кластер 0
FeaturePlot(pbmc, features = c("IL6ST", "FHIT")) #cluster 0 - Endothelial cells
VlnPlot(pbmc, features = c("IL6ST", "FHIT"))
Кластер 1
FeaturePlot(pbmc, features = c("TSHZ2", "BRWD1")) #cluster 1 - Neurons ???
VlnPlot(pbmc, features = c("TSHZ2", "BRWD1"))
Кластеры 2, 3, 5, 9
FeaturePlot(pbmc, features = c("LINC02446", "CD8B")) #cluster 2 - T-cells
FeaturePlot(pbmc, features = c("GZMH", "TRDC")) #cluster 3 - Gamma delta T + NK cells
FeaturePlot(pbmc, features = c("IGKC", "IGHM")) #cluster 5 - B-cells
FeaturePlot(pbmc, features = c("CDKN1C", "LYPD2")) #cluster 9 - Fibroblasts + EC
pbmc.markers %>%
group_by(cluster) %>%
top_n(n = 10, wt = avg_log2FC) -> top10
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
new.cluster.ids <- c("Endothelial cells", "Neurons", "T-cells", "Gamma delta T + NK cells", "Cluster 4", "B-cells", "Cluster 6", "Cluster 7", "Cluster 8", "Fibroblasts + EC", "Cluster 10")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
monaco.ref <- celldex::MonacoImmuneData()
## snapshotDate(): 2022-10-31
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
sce <- as.SingleCellExperiment(DietSeurat(pbmc))
## Warning: `invoke()` is deprecated as of rlang 0.4.0.
## Please use `exec()` or `inject()` instead.
## This warning is displayed once every 8 hours.
monaco.main <- SingleR(test = sce,assay.type.test = 1,ref = monaco.ref,labels = monaco.ref$label.main)
monaco.fine <- SingleR(test = sce,assay.type.test = 1,ref = monaco.ref,labels = monaco.ref$label.fine)
table(monaco.main$pruned.labels)
##
## B cells Basophils CD4+ T cells CD8+ T cells Dendritic cells
## 357 5 3431 1253 16
## Monocytes Neutrophils NK cells Progenitors T cells
## 581 35 597 66 982
pbmc@meta.data$monaco.main <- monaco.main$pruned.labels
pbmc@meta.data$monaco.fine <- monaco.fine$pruned.labels
pbmc <- SetIdent(pbmc, value = "monaco.main")
DimPlot(pbmc, label = T , repel = T, label.size = 3) + NoLegend()